# Load necessary libraries
library(xgboost)
library(caret)

# Read the pre-saved training and test sets from disk
train <- readRDS("train.rds")
test <- readRDS("test.rds")
print(train)
print(test)

# Split the training data 80%:20% into a tuning set and a hold-out set.
# floor() guards against fractional subscripts when nrow(train) is not a
# multiple of 5 (R silently truncates fractional indices, which is fragile).
n_rows <- nrow(train)
cut_point <- floor(0.8 * n_rows)
train_train <- train[1:cut_point, ]
test_train <- train[(cut_point + 1):n_rows, ]

# Use caret to find the best hyperparameters using 5-fold CV.

# Set up the grid of candidate values for the parameters being tuned;
# the remaining parameters are pinned at a single value.
control <- trainControl(method = "cv", number = 5)
xgb_grid <- expand.grid(
  nrounds = 10,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4, 5),
  subsample = 1
)
# Seed before train() so the CV fold assignment is reproducible.
set.seed(123)
xgb_caret <- train(
  x = train_train[-230],   # predictor columns (drop column 230, the label)
  y = train_train[, 230],  # column 230 holds the label
  method = "xgbTree",
  trControl = control,
  tuneGrid = xgb_grid
)
# NOTE(review): the recorded run warned "There were missing values in
# resampled performance measures." — usually a fold that could not be
# scored; worth investigating, though not fatal to the tune.
xgb_caret$bestTune

#From the run, the best tune parameters are max depth = 2, eta = 0.1 and min_child_weight = 1

train_labels <- train_train[, 230]

# Put the training and hold-out predictors into two separate xgb.DMatrix objects
dtrain <- xgb.DMatrix(data = as.matrix(train_train[, -230]), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_train[, -230]))

# Use the best parameters from the tune. bestTune reported max_depth = 2,
# eta = 0.1 and min_child_weight = 1 (see comment above); the values below
# are set to match that result — the previous eta = 0.01 / max_depth = 4 /
# min_child_weight = 2 contradicted it and the recorded cv log.
parameters <- list(
        objective = "reg:linear",  # NOTE(review): deprecated alias of "reg:squarederror" in xgboost >= 0.90
        booster = "gbtree",
        eta = 0.1,                 # default = 0.3
        gamma = 0,
        max_depth = 2,             # default = 6
        min_child_weight = 1,      # default = 1
        subsample = 1,
        colsample_bytree = 1
)
#cross validation using the inbuild xgb.cv() to find the best no of rounds. 
# Cross-validation with the built-in xgb.cv() to find the best number of rounds.
set.seed(123)
xgbcv <- xgb.cv(
  params = parameters,
  data = dtrain,
  nrounds = 10000,
  nfold = 5,
  showsd = TRUE,               # spell out TRUE/FALSE — T and F are reassignable
  stratified = TRUE,           # NOTE(review): stratification targets classification; likely irrelevant for this regression label — confirm
  print_every_n = 40,
  early_stopping_rounds = 10,  # stop once test RMSE fails to improve for 10 rounds
  maximize = FALSE             # RMSE is minimized
)
[1] train-rmse:11.413846+0.007614   test-rmse:11.413752+0.030885 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.

[41]    train-rmse:7.644597+0.005097    test-rmse:7.644520+0.032602 
[81]    train-rmse:5.122702+0.003400    test-rmse:5.123048+0.029993 
[121]   train-rmse:3.435296+0.002239    test-rmse:3.435759+0.026427 
[161]   train-rmse:2.306322+0.001437    test-rmse:2.307605+0.022512 
[201]   train-rmse:1.550925+0.000961    test-rmse:1.552953+0.018722 
[241]   train-rmse:1.045665+0.000752    test-rmse:1.048705+0.015871 
[281]   train-rmse:0.708045+0.000700    test-rmse:0.713113+0.013718 
[321]   train-rmse:0.482904+0.000802    test-rmse:0.491152+0.012104 
[361]   train-rmse:0.333550+0.000965    test-rmse:0.345948+0.010399 
[401]   train-rmse:0.235578+0.001194    test-rmse:0.253653+0.009072 
[441]   train-rmse:0.172388+0.001377    test-rmse:0.197499+0.008986 
[481]   train-rmse:0.132697+0.001564    test-rmse:0.165074+0.010620 
[521]   train-rmse:0.108632+0.001682    test-rmse:0.147400+0.012964 
[561]   train-rmse:0.094332+0.001771    test-rmse:0.138112+0.015068 
[601]   train-rmse:0.085758+0.001864    test-rmse:0.133278+0.016492 
[641]   train-rmse:0.080401+0.001925    test-rmse:0.130593+0.017600 
[681]   train-rmse:0.076742+0.001888    test-rmse:0.129021+0.018262 
[721]   train-rmse:0.074035+0.001825    test-rmse:0.127961+0.018783 
[761]   train-rmse:0.071941+0.001759    test-rmse:0.127210+0.019097 
[801]   train-rmse:0.070175+0.001732    test-rmse:0.126631+0.019305 
[841]   train-rmse:0.068606+0.001697    test-rmse:0.126181+0.019362 
[881]   train-rmse:0.067198+0.001641    test-rmse:0.125828+0.019394 
[921]   train-rmse:0.065869+0.001604    test-rmse:0.125473+0.019420 
[961]   train-rmse:0.064626+0.001572    test-rmse:0.125227+0.019502 
[1001]  train-rmse:0.063460+0.001568    test-rmse:0.124946+0.019540 
[1041]  train-rmse:0.062388+0.001617    test-rmse:0.124762+0.019490 
[1081]  train-rmse:0.061345+0.001632    test-rmse:0.124658+0.019473 
[1121]  train-rmse:0.060370+0.001645    test-rmse:0.124539+0.019446 
[1161]  train-rmse:0.059343+0.001610    test-rmse:0.124447+0.019409 
[1201]  train-rmse:0.058397+0.001612    test-rmse:0.124355+0.019398 
[1241]  train-rmse:0.057484+0.001613    test-rmse:0.124269+0.019407 
[1281]  train-rmse:0.056640+0.001594    test-rmse:0.124162+0.019396 
[1321]  train-rmse:0.055808+0.001611    test-rmse:0.124112+0.019350 
Stopping. Best iteration:
[1313]  train-rmse:0.055956+0.001618    test-rmse:0.124112+0.019363
# Train the final model with the tuned parameters. Use the best iteration
# reported by xgb.cv() rather than a hard-coded round count — the recorded
# cv log above stopped with best iteration 1313, not 1032. The original
# value is kept as a fallback in case best_iteration is unavailable.
best_nrounds <- if (!is.null(xgbcv$best_iteration)) xgbcv$best_iteration else 1032
xgb_mod <- xgb.train(data = dtrain, params = parameters, nrounds = best_nrounds)
XGBpred <- predict(xgb_mod, dtest)
head(XGBpred)
[1] 12.16587 13.11739 11.89546 11.99993 11.99828 12.43549
predictions_XGB <- XGBpred  # predictions appear to be on the log scale; exp() in the plot below reverses it

# Evaluation of results on the hold-out set
cor(predictions_XGB, test_train[, 230])
# [1] 0.9431406   (recorded output)

# Compute RMSE directly — rmse() belongs to a package (e.g. Metrics) that
# is never loaded in this script, so the bare call would fail.
sqrt(mean((test_train[, 230] - predictions_XGB)^2))
# [1] 0.1300017   (recorded output)
# Visualizing the results: actual vs predicted on the original (exp) scale
plot(exp(predictions_XGB), exp(test_train[, 230]),
     xlab = "Predicted Label", ylab = "Actual Label",
     main = "Plot of Actual Against Predicted Labels")
# Fitted regression line of actual on predicted. abline() draws the straight
# line directly — lines() over unsorted x values would zigzag across the plot.
lin.mod <- lm(exp(test_train[, 230]) ~ exp(predictions_XGB))
abline(lin.mod, col = "blue", lwd = 0.5)
# 45-degree reference line (perfect prediction) — no hard-coded axis range needed
abline(0, 1)
legend("topleft", legend = c("fitted line", "45 degree line"),
       col = c("blue", "black"), lty = 1, cex = 0.8)

# View variable importance plot
# install.packages("Ckmeans.1d.dp")
library(Ckmeans.1d.dp)  # required for ggplot clustering
mat <- xgb.importance(feature_names = colnames(train_train[, -230]), model = xgb_mod)
# Guard the subscript so it never overruns when fewer than 20 features
# carry importance weight.
top_n <- min(20, nrow(mat))
xgb.ggplot.importance(importance_matrix = mat[1:top_n], rel_to_first = TRUE)

LS0tDQp0aXRsZTogIlhHQm9vc3QiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCiNsb2FkIG5lY2Vzc2FyeSBsaWJyYXJpZXMNCmxpYnJhcnkoeGdib29zdCkNCmxpYnJhcnkoY2FyZXQpDQpgYGANCg0KYGBge3J9DQp0cmFpbj1yZWFkUkRTKCJ0cmFpbi5yZHMiKQ0KdGVzdD1yZWFkUkRTKCJ0ZXN0LnJkcyIpDQpwcmludCh0cmFpbikNCnByaW50KHRlc3QpDQoNCiN0cmFpbjp0ZXN0IC0+IDgwJToyMCUNCnRyYWluX3RyYWluPXRyYWluWzE6KDAuOCpkaW0odHJhaW4pWzFdKSxdDQp0ZXN0X3RyYWluPXRyYWluWygwLjgqZGltKHRyYWluKVsxXSsxKTooZGltKHRyYWluKVsxXSksXQ0KYGBgDQoNClVzZSBjYXJldCB0byBmaW5kIHRoZSBiZXN0IGh5cGVycGFyYW1ldGVycyB1c2luZyA1LWZvbGQgY3YNCmBgYHtyfQ0KI3NldCB1cCBncmlkIHRvIGNob29zZSB0aGUgYmVzdCB2YWx1ZXMgZm9yIHRoZSBmb2xsb3dpbmcgcGFyYW1ldGVycw0KDQpjb250cm9sIDwtdHJhaW5Db250cm9sKG1ldGhvZD0iY3YiLCBudW1iZXI9NSkNCg0KeGdiX2dyaWQgPSBleHBhbmQuZ3JpZCgNCm5yb3VuZHMgPSAxMCwNCmV0YSA9IGMoMC4xLCAwLjA1LCAwLjAxKSwNCm1heF9kZXB0aCA9IGMoMiwgMywgNCwgNSwgNiksDQpnYW1tYSA9IDAsDQpjb2xzYW1wbGVfYnl0cmVlPTEsDQptaW5fY2hpbGRfd2VpZ2h0PWMoMSwgMiwgMywgNCAsNSksDQpzdWJzYW1wbGU9MSkNCg0KeGdiX2NhcmV0IDwtIHRyYWluKHg9dHJhaW5fdHJhaW5bLTIzMF0sIHk9dHJhaW5fdHJhaW5bLDIzMF0sIG1ldGhvZD0neGdiVHJlZScsIHRyQ29udHJvbD0gY29udHJvbCwgdHVuZUdyaWQ9eGdiX2dyaWQpIA0KeGdiX2NhcmV0JGJlc3RUdW5lDQoNCmBgYA0KI0Zyb20gdGhlIHJ1biwgdGhlIGJlc3QgdHVuZSBwYXJhbWV0ZXJzIGFyZSBtYXggZGVwdGggPSAyLCBldGEgPSAwLjEgYW5kIG1pbl9jaGlsZF93ZWlnaHQgPSAgMQ0KDQpgYGB7cn0NCnRyYWluX2xhYmVscyA8LSB0cmFpbl90cmFpblssMjMwXQ0KDQojIHB1dCBvdXIgdGVzdGluZyAmIHRyYWluaW5nIGRhdGEgaW50byB0d28gc2VwZXJhdGVzIERtYXRyaXhzIG9iamVjdHMNCmR0cmFpbiA8LSB4Z2IuRE1hdHJpeChkYXRhID0gYXMubWF0cml4KHRyYWluX3RyYWluWywtMjMwXSksIGxhYmVsPSB0cmFpbl9sYWJlbHMpDQpkdGVzdCA8LSB4Z2IuRE1hdHJpeChkYXRhID0gYXMubWF0cml4KHRlc3RfdHJhaW5bLC0yMzBdKSkNCmBgYA0KDQpgYGB7cn0NCiNVc2luZyB0aGUgYmVzdCBwYXJhbWV0ZXJzIGZyb20gdGhlIHR1bmUNCnBhcmFtZXRlcnMgPC1saXN0KA0KICAgICAgICBvYmplY3RpdmUgPSAicmVnOmxpbmVhciIsDQogICAgICAgIGJvb3N0ZXIgPSAiZ2J0cmVlIiwNCiAgICAgICAgZXRhPTAuMSwgI2RlZmF1bHQgPSAwLjMNCiAgICAgICAgZ2FtbWE9MCwNCiAgICAgICAgbWF4X2RlcHRoPTIsICNkZWZhdWx0PTYNCiAgICAgICAgbWluX2NoaWxkX3dlaWdodD0xLCAjZGVmYXVsdD0xDQogICAgICAgIHN1
YnNhbXBsZT0xLA0KICAgICAgICBjb2xzYW1wbGVfYnl0cmVlPTENCikNCmBgYA0KDQpgYGB7cn0NCiNjcm9zcyB2YWxpZGF0aW9uIHVzaW5nIHRoZSBpbmJ1aWxkIHhnYi5jdigpIHRvIGZpbmQgdGhlIGJlc3Qgbm8gb2Ygcm91bmRzLiANCnNldC5zZWVkKDEyMykNCnhnYmN2IDwtIHhnYi5jdiggcGFyYW1zID0gcGFyYW1ldGVycywgZGF0YSA9IGR0cmFpbiwgbnJvdW5kcyA9IDEwMDAwLCBuZm9sZCA9IDUsIHNob3dzZCA9IFQsIHN0cmF0aWZpZWQgPSBULCBwcmludF9ldmVyeV9uID0gNDAsIGVhcmx5X3N0b3BwaW5nX3JvdW5kcyA9IDEwLCBtYXhpbWl6ZSA9IEYpDQpgYGANCg0KYGBge3J9DQojYmFzZWQgb24gdGhlIGJlc3QgdHVuZSBwYXJhbWV0ZXJzIGFuZCBucm91bmRzID0gMTAzMg0KeGdiX21vZCA8LSB4Z2IudHJhaW4oZGF0YSA9IGR0cmFpbiwgcGFyYW1zPSBwYXJhbWV0ZXJzLCBucm91bmRzID0gMTAzMikNCg0KYGBgDQoNCmBgYHtyfQ0KWEdCcHJlZCA8LSBwcmVkaWN0KHhnYl9tb2QsIGR0ZXN0KQ0KaGVhZChYR0JwcmVkKQ0KDQpwcmVkaWN0aW9uc19YR0IgPC1YR0JwcmVkICNuZWVkIHRvIHJldmVyc2UgdGhlIGxvZyB0byB0aGUgcmVhbCB2YWx1ZXMNCg0KYGBgDQoNCmBgYHtyfQ0KI2V2YWx1YXRpb24gb2YgcmVzdWx0cw0KY29yKHByZWRpY3Rpb25zX1hHQix0ZXN0X3RyYWluWywyMzBdKQ0Kcm1zZSh0ZXN0X3RyYWluWywyMzBdLHByZWRpY3Rpb25zX1hHQikNCmBgYA0KDQpgYGB7cn0NCiN2aXN1YWxpemluZyB0aGUgcmVzdWx0cw0KcGxvdChleHAocHJlZGljdGlvbnNfWEdCKSxleHAodGVzdF90cmFpblssMjMwXSkseGxhYj0iUHJlZGljdGVkIExhYmVsIix5bGFiPSJBY3R1YWwgTGFiZWwiLG1haW49IlBsb3Qgb2YgQWN0dWFsIEFnYWluc3QgUHJlZGljdGVkIExhYmVscyIpDQpsaW4ubW9kPWxtKGV4cCh0ZXN0X3RyYWluWywyMzBdKX5leHAocHJlZGljdGlvbnNfWEdCKSkNCnByLmxtPXByZWRpY3QobGluLm1vZCkNCmxpbmVzKHByLmxtfmV4cChwcmVkaWN0aW9uc19YR0IpLCBjb2w9ImJsdWUiLCBsd2Q9MC41KQ0KbGluZXMoYygwLDQ1MDAwMCksIGMoMCw0NTAwMDApKQ0KDQpsZWdlbmQoInRvcGxlZnQiLCBsZWdlbmQ9YygiZml0dGVkIGxpbmUiLCAiNDUgZGVncmVlIGxpbmUiKSxjb2w9YygiYmx1ZSIsICJibGFjayIpLCBsdHk9MSwgY2V4PTAuOCkNCmBgYA0KDQpgYGB7ciwgb3V0LndpZHRoPSIxMDAlIn0NCiN2aWV3IHZhcmlhYmxlIGltcG9ydGFuY2UgcGxvdA0KI2luc3RhbGwucGFja2FnZXMoIkNrbWVhbnMuMWQuZHAiKQ0KbGlicmFyeShDa21lYW5zLjFkLmRwKSAjcmVxdWlyZWQgZm9yIGdncGxvdCBjbHVzdGVyaW5nDQptYXQgPC0geGdiLmltcG9ydGFuY2UgKGZlYXR1cmVfbmFtZXMgPSBjb2xuYW1lcyh0cmFpbl90cmFpblssLTIzMF0pLG1vZGVsID0geGdiX21vZCkNCnhnYi5nZ3Bsb3QuaW1wb3J0YW5jZShpbXBvcnRhbmNlX21hdHJpeCA9IG1hdFsxOjIwXSwgcmVsX3RvX2ZpcnN0ID0gVFJVRSkNCmBg
YA0K